###########################################################
# Kerice Doten-Snitker
# 
# Annualize Settlement Data - prep and cleaning
# 
# Built under R 3.4.3 
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
###########################################################

# set locale to preempt UTF-8 issues with the data
#Sys.getlocale()
#Sys.setlocale("LC_CTYPE", "en_US.UTF-8")

#for windows
#Sys.setlocale(category = "LC_ALL", locale = "English_United States.1252")

# add packages with the pacman library, which installs if not installed
# (pacman itself must already be installed, otherwise library() stops with
# an error before p_load() can do anything)
library(pacman)
# use the here package for improving replicability
# (the input file path further down is built with here::here())
p_load(here)
# for data manipulation
# NOTE(review): plyr is requested before tidyverse; presumably this is so
# that dplyr is attached last and its verbs win any name clashes with
# plyr - confirm the ordering is intentional
p_load(magrittr, plyr, tidyverse, reshape2)

# fix the random seed
# NOTE(review): nothing in the visible script draws random numbers -
# confirm whether the seed is still needed
set.seed(1418)

# will need to consider sensitivity to:
# whether and how early to pre-date settlement before a persecution
# successful vs attempted prior expulsions elsewhere
# length of "previous" period - how far back to look

###########################################################
### Observations are settlements
### with certain Jewish presence, that is, Juden!=88,
### AND no nichtstaedtische settlements.
###########################################################

# read the settlement catalogue into the R environment
mydata_a <- read.csv(
  here::here("in_data", "Haverkamp_Ortskatalog_limited_resettlement.csv"),
  header = TRUE,
  encoding = "latin1",
  stringsAsFactors = FALSE
)

#length(unique(mydata_a$PlaceID))
#length(mydata_a$PlaceID[!is.na(mydata_a$JEntryYr1)])

# 99 is the missing-data code: turn it into NA everywhere except in the ID
# column(s), where 99 can be a legitimate value
# mydata_a2[mydata_a2==99] <- NA # includes IDs - yikes!
IDs <- c("PlaceID")
mydata_a2 <- mydata_a
mydata_a2[!(names(mydata_a2) %in% IDs)] <- lapply(
  mydata_a2[!(names(mydata_a2) %in% IDs)],
  function(column) ifelse(column == 99, NA, column)
)
# keep only the settlements that have a first entry year, and renumber the
# rows 1..n
mydata_a2 <- mydata_a2[!is.na(mydata_a2$JEntryYr1), ]
rownames(mydata_a2) <- NULL

# lose any settlements when omitting empty rows? 3 that had 99's
#setdiff(unique(mydata_a$PlaceID), unique(mydata_a2$PlaceID)) 

###########################################################
### Task: Change from year ranges to having all years for each settlement
###########################################################

#####
# 1: Change from wide data to long ----------------------------------------
#####

# Target layout: [PlaceName, PlaceID, Entry, Entrym, Exit, Exitm]
# approach adapted from: https://stackoverflow.com/questions/12466493/reshaping-multiple-sets-of-measurement-columns-wide-format-into-single-columns
# Columns 3 to 26 are stacked as six repeats of four measures
# (Start, StartM, End, EndM), so each measure sits in every fourth column.
mydata_a3 <- reshape(
  mydata_a2,
  direction = "long",
  idvar = c("PlaceID", "PlaceName"),
  varying = list(
    Start = seq(from = 3, to = 23, by = 4),
    StartM = seq(from = 4, to = 24, by = 4),
    End = seq(from = 5, to = 25, by = 4),
    EndM = seq(from = 6, to = 26, by = 4)
  ),
  v.names = c("EntryYr", "EntryYrM", "ExitYr", "ExitYrM")
)
#str(mydata_a3)
# discard the repeats that carry no entry year
mydata_a4 <- subset(mydata_a3, !is.na(EntryYr))
nrow(mydata_a4)
# NOTE(review): this counts settlements in mydata_a3, i.e. before the empty
# rows were dropped - confirm that mydata_a4 was not intended
length(unique(mydata_a3$PlaceID))
# sort by settlement; row names travel with the rows
mydata_a4 <- mydata_a4[order(mydata_a4[["PlaceID"]]), ]

#####
# 2: Adjust for censoring and missingness ----------------------------------
#####
# distribution of entry and exit years and of their marker columns
summary(mydata_a4$EntryYr)
summary(mydata_a4$ExitYr)
table(mydata_a4$ExitYrM, useNA = "ifany")
table(mydata_a4$ExitYrM[mydata_a4$ExitYr == 1520], useNA = "ifany")
# 127 potential cases of right censoring - not of interest for this study

table(mydata_a4$EntryYrM, useNA = "ifany")
# 6 cases of left censoring where settlement preceded study period
# 293 cases where the settlement entry date before persecution is unknown

# entry markers for records that start in the years of a persecution wave
# (EntryYr has no NAs at this point, so %in% selects the same rows as ==)
# 2 obs start with Rintfleisch persecution
table(mydata_a4$EntryYrM[mydata_a4$EntryYr %in% c(1288, 1289)],
      useNA = "ifany")
# 15 start with Armleder persecution
table(mydata_a4$EntryYrM[mydata_a4$EntryYr %in% c(1336, 1337)],
      useNA = "ifany")
# 151 start with Plague persecution
table(mydata_a4$EntryYrM[mydata_a4$EntryYr %in% c(1348, 1349, 1350)],
      useNA = "ifany")

# How long had communities been resident before a Plague persecution?
# Dummy = 1 where the record ends in a Plague year (1348-1350) and the entry
# is not itself flagged as a persecution (EntryYrM != 2); rows where the
# condition cannot be evaluated stay 0
tenure <- mydata_a4
tenure$Plague <- as.numeric(
  (tenure$EntryYrM != 2 & tenure$ExitYr %in% c(1348, 1349, 1350)) %in% TRUE
)
summary(tenure$Plague)
# pre-Plague tenure, defined only for the flagged rows
tenure$prePlague <- with(tenure, ExitYr - EntryYr)
is.na(tenure$prePlague) <- tenure$Plague == 0
summary(tenure$prePlague)
hist(tenure$prePlague, breaks = seq(0, 350, by = 10))
# spikes with starting at 1301, 1251, 1201
# but unlikely that Plague was 1st mention if long prior tenure
# and Jewish migration into the region didn't take off until 13th c
# in absence of any other record, assume recent migration
# so: blank out tenures longer than 50 and redo the distribution summary
tenure$prePlague2 <- replace(
  tenure$prePlague,
  which(tenure$prePlague > 50),
  NA
)
summary(tenure$prePlague2) # 1st Quartile is 22 - take that?
hist(tenure$prePlague2, breaks = seq(0, 50, by = 1))

# What's the migration pattern for settlement in 1301-1350 (13A)?
# Flag in-migration during 13A, excluding records whose first evidence is
# a persecution (Armleder, Plague)
tenure$Entry_13A <- as.numeric(
  (tenure$EntryYrM != 2 &
     tenure$EntryYr >= 1301 &
     tenure$EntryYr <= 1350) %in% TRUE
)
summary(tenure$Entry_13A)
# tenure up to 1350 for settlement in 13A, defined only for the flagged rows
tenure$Entry_13A_t <- 1350 - tenure$EntryYr
is.na(tenure$Entry_13A_t) <- tenure$Entry_13A == 0
summary(tenure$Entry_13A_t)
hist(tenure$Entry_13A_t, breaks = seq(0, 50, by = 5))
# there doesn't seem to be a specific pattern for settlement in 13A;
# growth from 1320s on;
# higher frequencies in 1310s and early 1320s show French expulsions -
# settlement in HRE after these seems plausible

# add preceding buffer years where first mention is for a persecution

# Shift EntryYr back by `years_back` for every row whose entry is flagged as
# a persecution (EntryYrM == 2); all other rows are returned untouched.
#   spells     data frame with EntryYr and EntryYrM columns
#   years_back number of years to subtract from the flagged entry years
# Returns `spells` with the adjusted EntryYr; row names and column order are
# kept as they are (the row names are needed in the next section).
# `%in%` is used rather than `==` so that a missing EntryYrM counts as
# "not flagged" instead of turning a known EntryYr into NA.
predate_entry <- function(spells, years_back) {
  first_is_persecution <- spells$EntryYrM %in% 2
  spells$EntryYr[first_is_persecution] <-
    spells$EntryYr[first_is_persecution] - years_back
  spells
}

# try 3 years, counted inclusively: the persecution year plus the 2 before
mydata_3yr <- predate_entry(mydata_a4, years_back = 2)
summary(mydata_3yr$EntryYr[mydata_a4$EntryYrM %in% 2])

# try 21 years (again inclusive, so 20 years are subtracted) -
# 1st Quartile of tenure when first evidence is in 13A,
# excluding where persecution is first evidence
mydata_21yr <- predate_entry(mydata_a4, years_back = 20)
summary(mydata_21yr$EntryYr[mydata_a4$EntryYrM %in% 2])

#####
# 3: Change from one row per entry/exit to one row per year per settlement ---
#####

# get inclusive enumeration of years: one integer sequence per entry/exit
# row of mydata_3yr, running from EntryYr to ExitYr
yrs_by_spell <- Map(seq, mydata_3yr$EntryYr, mydata_3yr$ExitYr)
# number of years each row expands into
n_yrs <- lengths(yrs_by_spell)

# One row per settlement-year. PlaceID and PlaceName are repeated straight
# from mydata_3yr, once for every year of the corresponding row, instead of
# being parsed back out of generated row names; this keeps names that
# contain dots or digits intact.
mydata_y <- data.frame(
  # observation ID by settlement-year
  ObsID_Yr = seq_len(sum(n_yrs)),
  Year = unlist(yrs_by_spell, use.names = FALSE),
  PlaceID = as.numeric(rep(mydata_3yr$PlaceID, times = n_yrs)),
  PlaceName = rep(mydata_3yr$PlaceName, times = n_yrs),
  stringsAsFactors = FALSE
)

nrow(mydata_y)
length(unique(mydata_y$ObsID_Yr)) # unique settlement-year obs
length(unique(mydata_y$PlaceName)) # unique names
length(unique(mydata_y$PlaceID)) # unique settlement IDs

#####
# 4: Prep for merging with periodized dataset -------------------------------
#####

# add in period identifier in order to know which covariates to attach
# Periods are right-closed year intervals:
#   1: up to 1100   2: 1101-1200   3: 1201-1250   4: 1251-1300
#   5: 1301-1350    6: 1351-1400   7: 1401-1450   8: 1451-1500
#   9: 1501-1520
# Years after 1520 (and missing years) get NA.
mydata_y$Period2 <- as.numeric(cut(
  mydata_y$Year,
  breaks = c(-Inf, 1100, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1520),
  labels = FALSE,
  right = TRUE
))

# saveRDS(mydata_y, here::here("in_data", "diffusion.rds"))
